
* Session set-up: close any log left open, empty memory, fix the random-number
* seed, and turn off output paging (stored as a permanent setting).
cap log close
clear all
set seed 1234
set more off, permanently

/////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Figure O.5: Standard Deviations of IRT Residuals ///////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////


* Load the Phase 2 dataset (norm_scores)
use "Work Data/Gender_Phase2_long.dta", clear

*--------------------------------------------------------------------------------
* Main sample
*--------------------------------------------------------------------------------

* 1) Retain only observations from years without affirmative action
*    (rows with aa_year different from 1, including missing, are kept)
keep if aa_year!=1
tabulate year


**********************************************************************************
****************** Question's score by order *************************************
**********************************************************************************

* Builds twelve subject-agnostic columns, score_item1-score_item12, from the
* subject-specific item variables (<subject><i>_st2), then saves the result as
* the input file for the IRT step.

* The list of subjects does not depend on the item index, so compute it once
* instead of once per item.
levelsof subject, local(levels)

forvalues i=1(1)12 {
    gen score_item`i'=.
    foreach s of local levels {
        * Take the i-th item score of subject s for the rows of that subject
        replace score_item`i'=`s'`i'_st2 if subject=="`s'"
        * Items flagged as missing in Phase 2 are set back to missing
        replace score_item`i'=. if missing_p2_`s'`i'==1 & subject=="`s'"
    }
    * IRT command does not allow noninteger values, so scores are doubled
    replace score_item`i'=2*score_item`i'
    tab subject, sum(score_item`i')
}

* Keep only what the IRT step needs: the items plus identifiers and covariates
keep score_item1-score_item12 subject year sex female inscri2

save "Work Data/IRT_Data.dta", replace

**********************************************************************************
****************** Predicted scores***********************************************
**********************************************************************************

* For each year 2001-2004 and each subject: fit a graded response model on the
* item scores, compute a predicted score per item, and save one file per
* year-subject pair (Predicted_Scores_<year>_<subject>.dta).
forvalues j=1(1)4 {
foreach i in hist biol geog math phy chem port lang {	
* Reload the item file on every pass: the previous pass ends with clear
use "Work Data/IRT_Data.dta", clear
keep if year==200`j' & subject=="`i'"
* Item 6 of Physics in 2003 is blanked so that it is excluded from the model;
* capture silences any error from this line
capture replace score_item6=. if year==2003 & subject=="phy"
* NOTE(review): missings is not an official Stata command (presumably the
* community-contributed SSC package); dropvars presumably removes variables that
* are missing for every remaining observation - confirm it is installed
missings dropvars, force
* Graded response model on the item columns still present in the range
irt grm score_item1-score_item12
* Latent trait prediction; this variable is not among those kept before saving
predict ability_`i', latent
forvalues z=1(1)12 {
* Per-category predictions for item z, stored as c_it<subject><z>_1, _2, ...
* capture noisily prints any error but lets the loop continue (for example when
* item z is no longer in the data)
capture noisily predict c_it`i'`z'_* if subject=="`i'", outcome(score_item`z')
* Create, as zeros, whichever of the 11 category variables do not exist yet;
* gen fails on the ones predict already created and capture noisily skips them
forvalues k=1(1)11 {
capture noisily gen c_it`i'`z'_`k'=0
}
* Predicted item score: category k is weighted by (k-1)/2, i.e. 0, .5, ..., 5,
* which undoes the doubling applied when the item scores were built.
* NOTE(review): this assumes category k always corresponds to a doubled score of
* k-1. If some score level is never observed for an item, the categories are
* presumably numbered consecutively and the weights would be shifted - TODO confirm
gen pred_`i'_`z'=c_it`i'`z'_2*.5+c_it`i'`z'_3+c_it`i'`z'_4*1.5+c_it`i'`z'_5*2+c_it`i'`z'_6*2.5+c_it`i'`z'_7*3+c_it`i'`z'_8*3.5+c_it`i'`z'_9*4+c_it`i'`z'_10*4.5+c_it`i'`z'_11*5 if subject=="`i'"
}
* Only the predicted scores and the merge keys are saved
keep pred_* subject year inscri2
save "Work Data/Predicted_Scores_200`j'_`i'.dta", replace
clear
}
}

**********************************************************************************
***************** Generating Dataset of Predicted Scores *************************
**********************************************************************************

* Step 1: for each year 2001-2004, merge the eight subject-level files into one
* file per year (Predicted_Scores_<year>_all.dta), starting from biology.
forvalues j=1(1)4 {
    use "Work Data/Predicted_Scores_200`j'_biol.dta", clear
    drop subject
    foreach i in hist geog math phy chem port lang {
        merge 1:1 inscri2 year using "Work Data/Predicted_Scores_200`j'_`i'.dta"
        drop _merge
        save "Work Data/Predicted_Scores_200`j'_all.dta", replace
        * NOTE(review): the 5-second pause presumably gives the file system or a
        * sync client time to release the file just written - confirm it is needed
        sleep 5000
    }
}

* Step 2: stack the yearly files into a single dataset.
* Start from empty memory and append exactly the four files built in step 1.
* The loop previously ran over 2000-2003: no 2000 file is created by this
* script, so the append stopped with a file-not-found error, and 2004 was only
* included because it happened to be left in memory by step 1.
clear
forvalues j=1(1)4 {
    append using "Work Data/Predicted_Scores_200`j'_all.dta"
    sleep 5000
}
save "Work Data/Predicted_Score_all.dta", replace

**********************************************************************************
************************************* IRT residuals *******************************
**********************************************************************************

* Start again from the Phase 2 dataset (norm_scores)
use "Work Data/Gender_Phase2_long.dta", clear

*--------------------------------------------------------------------------------
* Main sample
*--------------------------------------------------------------------------------

* 1) Remove affirmative-action years and the year 2000
drop if aa_year==1 | year==2000
tabulate year

* 2) Remove Portuguese and Foreign Language: Phase 1 has no exam for either
*    (for Portuguese, Phase 1 has an essay instead)
tabulate subject, summarize(norm_p1score)
drop if inlist(subject, "lang", "port")
tabulate subject, summarize(norm_p1score)
drop Language Portuguese ///
     prio_Language prio_Portuguese ///
     fem_prio_Language fem_prio_Portuguese

*--------------------------------------------------------------------------------
* Predicted score
*--------------------------------------------------------------------------------

* Attach the predicted item scores and keep matched rows only.
* NOTE(review): m:1 on inscri2 alone requires inscri2 to be unique in the using
* file, i.e. no identifier repeated across years - confirm
merge m:1 inscri2 using "Work Data/Predicted_Score_all.dta"
drop if _merge!=3
summarize pred*

*********************************************************************************
**************** Residuals ******************************************************
*********************************************************************************

* Residual of each item = observed item score (<subject><i>_st2) minus the
* predicted item score (pred_<subject>_<i>)
foreach v in biol chem hist phy geog math {
    forvalues i=1(1)12 {
        gen res_`v'_`i'=`v'`i'_st2-pred_`v'_`i'
        sum res_`v'_`i'
    }
}

* Stack the subject-specific columns into subject-agnostic ones, so that each
* row carries the prediction, residual and score of its own subject.
* The subject list is the same for every item, so it is computed once.
levelsof subject, local(levels)

forvalues i=1(1)12 {
    gen pred_score_item`i'=.
    gen res_score_item`i'=.
    gen score_item`i'=.
    foreach s of local levels {
        replace pred_score_item`i'=pred_`s'_`i' if subject=="`s'"
        replace res_score_item`i'=res_`s'_`i' if subject=="`s'"
        replace score_item`i'=`s'`i'_st2 if subject=="`s'"
        * The subject-specific columns are no longer needed once copied
        drop pred_`s'_`i' res_`s'_`i'
    }
    tab subject, sum(pred_score_item`i')
    tab subject, sum(res_score_item`i')
    tab subject, sum(score_item`i')
}

* Setting to missing the values for Physics in 2003 (everyone received maximum score).
* Item 6 is the one excluded from the 2003 Physics model when the predictions
* are built, so its prediction is a zero placeholder and its residual equals the
* raw score. The previous lines targeted pred_score_item and res_score_item,
* which do not exist, and the resulting error was hidden by capture, so nothing
* was ever replaced and the bogus residual entered the standard deviation below.
replace pred_score_item6=. if year==2003 & subject=="phy"
replace res_score_item6=. if year==2003 & subject=="phy"

* Row-wise standard deviation of the item residuals (missing items are ignored)
egen res_subject_sd=rowsd(res_score_item*)
sum res_subject_sd
label var res_subject_sd "Standard deviation of questions' residuals (within subject)"

* Female versus male distributions, separately for priority and non-priority
* subjects: Kolmogorov-Smirnov test, kernel densities, and a combined PDF.

foreach v of varlist res_subject_sd {

    local lab: variable label `v'

    * Priority subjects first (1), then non-priority subjects (0)
    foreach grp in 1 0 {

        if `grp'==1 {
            local ttl "Priority"
            local stub "prio"
        }
        else {
            local ttl "Non-priority"
            local stub "nonprio"
        }

        * Equality-of-distributions test between genders; the p-value goes in the note
        ksmirnov `v' if priority==`grp', by(female)
        local pval=round(r(p),0.0001)

        * Density of the variable by gender within this group of subjects
        twoway (kdensity `v' if female==1 & priority==`grp', lcolor(purple) xlabel(#10) xtitle("") ytitle("")) ///
               (kdensity `v' if female==0 & priority==`grp', lcolor(green)  xlabel(#10) lpattern(dash)), ///
               note("P-value of the Kolmogorov-Smirnov test = `pval'") ///
               legend(order(1 "Female" 2 "Male" )size(small) position(6) cols(2)) ///
               title("`ttl'", size(small)) ///
               saving("Output/`stub'_`v'.gph", replace)
    }

    * Put the two panels side by side on common axes and export
    graph combine "Output/prio_`v'.gph" "Output/nonprio_`v'.gph", ycommon xcommon altshrink /*title("`lab'", size(medium))*/
    graph export "Output/prio_nonprio_`v'.pdf", as(pdf) replace
}


